library(tidyverse)
library(scales)
library(svglite)

## Load the two inputs used throughout this script:
##   indo_dt - the language-filtered dataset
##   vblr    - the tweets in the vertical bands
## and derive vblr_orig_otsus: the rows of vblr flagged both as original
## (is_original) and as mentioning special autonomy (otsus).

indo_dt <- readRDS("/home/ubuntu/data/shared_folder/Data/20220729_indo_dt.rds")

vblr <- readRDS("/home/ubuntu/data/shared_folder/Data/20220719_vblr.rds")

vblr_orig_otsus <- filter(vblr, is_original & otsus)

## Tally content codes for three populations - all tweets in vblr, original
## tweets only, and original otsus tweets only - using two ways of counting:
##   *_n   : one count per tweet
##   *_d_n : one count per distinct vblr_0.69_jaccard_components value, i.e.
##           each set of duplicates is counted once (the first row of each
##           component supplies the content_code).

# Per-tweet tallies
all_cc_tallies <- count(vblr, content_code, name = "all_n")

orig_cc_tallies <- vblr %>%
  filter(is_original) %>%
  count(content_code, name = "orig_n")

orig_otsus_cc_tallies <- count(
  vblr_orig_otsus, content_code,
  name = "orig_otsus_n"
)

# Per-component tallies
all_cc_distinct_tallies <- vblr %>%
  distinct(vblr_0.69_jaccard_components, .keep_all = TRUE) %>%
  count(content_code, name = "all_d_n")

orig_cc_distinct_tallies <- vblr %>%
  filter(is_original) %>%
  distinct(vblr_0.69_jaccard_components, .keep_all = TRUE) %>%
  count(content_code, name = "orig_d_n")

orig_otsus_cc_distinct_tallies <- vblr_orig_otsus %>%
  distinct(vblr_0.69_jaccard_components, .keep_all = TRUE) %>%
  count(content_code, name = "orig_otsus_d_n")

# Combine the six tally tables into cc_complete, one row per content_code,
# with columns in the order:
#   all_n, all_d_n, orig_n, orig_d_n, orig_otsus_n, orig_otsus_d_n
#
# `all = TRUE` requests a full outer join, so a content_code missing from
# either side is kept and its absent counts are filled with NA. Note that
# merge()'s argument names are case-sensitive: an upper-case `all.Y` is not
# recognised, is swallowed by `...`, and leaves the merge as a left join.
cc_complete <- Reduce(
  function(merged, tallies) {
    merge(x = merged, y = tallies, by = "content_code", all = TRUE)
  },
  list(
    all_cc_tallies,
    all_cc_distinct_tallies,
    orig_cc_tallies,
    orig_cc_distinct_tallies,
    orig_otsus_cc_tallies,
    orig_otsus_cc_distinct_tallies
  )
)

# Finally, add three columns giving the number of tweets per distinct
# component for each pair of tallies. Division is already vectorised over
# rows, so rowwise() is not needed; as_tibble() keeps the result a tibble.
cc_complete <- cc_complete %>%
  as_tibble() %>%
  mutate(
    all_per = all_n / all_d_n,
    orig_per = orig_n / orig_d_n,
    orig_otsus_per = orig_otsus_n / orig_otsus_d_n
  )

# Save the resulting data frame
saveRDS(cc_complete, file = "/home/ubuntu/data/shared_folder/Data/cc_complete.rds")

## Summary statistics for authors in the vertical bands

# Median follower count, using the first row seen for each author_id, for:
#   1. authors of original otsus tweets in the vertical bands
#   2. every author in indo_dt
#   3. indo_dt authors who are not in group 1
vblr_orig_otsus %>%
  filter(!duplicated(author_id)) %>%
  pull(author_followers) %>%
  median()

indo_dt %>%
  filter(!duplicated(author_id)) %>%
  pull(author_followers) %>%
  median()

# Both conditions depend only on author_id, so testing them together keeps
# exactly the first row of each author outside group 1.
indo_dt %>%
  filter(
    !author_id %in% vblr_orig_otsus$author_id,
    !duplicated(author_id)
  ) %>%
  pull(author_followers) %>%
  median()

## Data for the follower-count figure.
## For every author in indo_dt: take the maximum author_followers value, bin
## it, and flag whether the author also appears in vblr_orig_otsus (i.e. posted
## an original special autonomy tweet in the vertical bands). Then count
## accounts per (bin, author group) and express each count as a percentage of
## its author group.

test <- indo_dt %>%
  group_by(author_id) %>%
  summarise(max_followers = max(author_followers)) %>%
  mutate(
    followers = factor(
      case_when(
        max_followers == 0 ~ "0",
        max_followers <= 10 ~ "1-10",
        max_followers <= 100 ~ "11-100",
        max_followers > 100 ~ "more than 100"
      ),
      levels = c("0", "1-10", "11-100", "more than 100")
    ),
    in_vblr_oo = factor(
      ifelse(
        author_id %in% vblr_orig_otsus$author_id,
        "Posted original special autonomy tweet in vertical bands",
        "Other authors"
      ),
      levels = c(
        "Posted original special autonomy tweet in vertical bands",
        "Other authors"
      )
    )
  ) %>%
  # count() gives the same per-group n as group_by() + summarise(n = n()),
  # without leaving a residual grouping behind.
  count(followers, in_vblr_oo) %>%
  group_by(in_vblr_oo) %>%
  mutate(perc = 100 * n / sum(n)) %>%
  # Drop the grouping so later code does not inherit it by accident.
  ungroup()

# Dodged bar chart of the percentage of accounts in each follower bin.
# Fill colours follow the factor level order of in_vblr_oo: the first level
# (authors of original special autonomy tweets in the vertical bands) is
# "#0C7BDC" (blue), "Other authors" is "#FFC20A" (yellow).
# NOTE(review): a numeric legend.position is deprecated from ggplot2 3.5.0 in
# favour of legend.position = "inside" plus legend.position.inside; left
# unchanged here because the installed ggplot2 version is not known.
followers_plot <- ggplot(test, aes(x = followers, y = perc, fill = in_vblr_oo)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  scale_fill_manual(values = c("#0C7BDC", "#FFC20A")) +
  labs(x = "Number of followers", y = "Percent of accounts", fill = "") +
  theme(legend.position = c(.25, .85))

# Display the figure when the script is run interactively or via Rscript
followers_plot

# Pass the plot explicitly instead of relying on last_plot(), and spell out
# `filename` rather than depending on partial argument matching of `file`.
ggsave(
  filename = "figures/6_number_followers.png",
  plot = followers_plot,
  width = 2400, height = 1200, units = "px"
)


## Proportion of accounts whose author_description is the empty string,
## counting each author_id once (first row seen): first for the entire
## dataset, then for authors of original otsus tweets in the vertical bands.
## Missing descriptions are not counted as blank but stay in the denominator.
indo_dt %>%
  distinct(author_id, .keep_all = TRUE) %>%
  pull(author_description) %>%
  {
    sum(. == "", na.rm = TRUE) / length(.)
  } # 0.1892784

vblr_orig_otsus %>%
  distinct(author_id, .keep_all = TRUE) %>%
  pull(author_description) %>%
  {
    sum(. == "", na.rm = TRUE) / length(.)
  } # 0.5086207

## Preparation for drawing random samples of 50 authors who (a) post more than
## the median number of original otsus tweets in the vertical bands, (b) post
## the median number or fewer, and (c) post in the vertical bands but never an
## original tweet mentioning otsus.
## This is more involved than it needs to be because the samples were
## generated using a piece of analysis that was not ultimately used.

# Number of original otsus tweets in the vertical bands, per author
vblr_orig_otsus_no_tweets <- vblr_orig_otsus %>%
  group_by(author_id) %>%
  summarise(no_of_vblr_orig_otsus_tweets = n()) %>%
  ungroup()

# Attach the per-author count to every tweet row. merge() is kept on purpose
# (rather than a dplyr join): by default it sorts the result by author_id, and
# the seeded samples drawn from this table depend on that row order.
# Argument names are case-sensitive, so it must be `all.y`, not `all.Y`.
vblr_orig_otsus <- merge(
  x = vblr_orig_otsus, y = vblr_orig_otsus_no_tweets,
  by = "author_id", all.x = TRUE, all.y = FALSE
)

# Median tweets per author, taking one row per author
median_no_tweets_vblr_orig_otsus <- vblr_orig_otsus %>%
  distinct(author_id, .keep_all = TRUE) %>%
  pull(no_of_vblr_orig_otsus_tweets) %>%
  median() # 3

# Draw the three author samples. The seed is fixed so the draws are
# reproducible; the order of the slice_sample() calls must stay
# above -> below/equal -> the rest, because each call consumes random numbers.
# The above and below/equal-median samples are straightforward; the sample of
# the rest first needs a "the rest" table of vblr authors who never posted an
# original otsus tweet.

# Columns kept in each sample, in output order
sample_cols <- c(
  "author_id", "author_username", "author_name",
  "author_tweets", "no_of_vblr_orig_otsus_tweets", "author_created",
  "author_description", "author_followers", "author_following",
  "author_image_url", "co_similar_group"
)

set.seed(3)

# Compare against the computed median rather than a hard-coded 3, so the split
# stays correct if the underlying data changes.
above_median_50 <- vblr_orig_otsus %>%
  distinct(author_id, .keep_all = TRUE) %>%
  filter(no_of_vblr_orig_otsus_tweets > median_no_tweets_vblr_orig_otsus) %>%
  slice_sample(n = 50) %>%
  select(all_of(sample_cols))

below_median_50 <- vblr_orig_otsus %>%
  distinct(author_id, .keep_all = TRUE) %>%
  filter(no_of_vblr_orig_otsus_tweets <= median_no_tweets_vblr_orig_otsus) %>%
  slice_sample(n = 50) %>%
  select(all_of(sample_cols))

vblr_orig_otsus_authors <- vblr_orig_otsus %>%
  distinct(author_id) %>%
  pull(author_id)

vblr_the_rest <- vblr %>%
  distinct(author_id, .keep_all = TRUE) %>%
  filter(!author_id %in% vblr_orig_otsus_authors)

# "The rest" has no per-author otsus tweet count, so that column is left out
the_rest_50 <- vblr_the_rest %>%
  slice_sample(n = 50) %>%
  select(all_of(setdiff(sample_cols, "no_of_vblr_orig_otsus_tweets")))

## Median account-creation date (jkt_author_created) among authors of original
## otsus tweets in the vertical bands, followed by the proportion of all
## indo_dt authors whose accounts were created before that median.
median_vblr_orig_otsus <- vblr_orig_otsus %>%
  distinct(author_id, .keep_all = TRUE) %>%
  pull(jkt_author_created) %>%
  median()
median_vblr_orig_otsus

# One row per author; authors with a missing creation date are not counted as
# "before" but remain in the denominator.
indo_dt %>%
  distinct(author_id, .keep_all = TRUE) %>%
  pull(jkt_author_created) %>%
  {
    sum(. < median_vblr_orig_otsus, na.rm = TRUE) / length(.)
  } # proportion 0.9913133

## A final task is to calculate the proportion of accounts that Twitter has
## suspended since we scraped the dataset in June 2021

## Account statuses for indo_dt authors.
## colClasses drops the first CSV column, reads the second and fourth as
## factors and lets read.csv infer the type of the third.
account_status_october <- read.csv(
  "/home/ubuntu/data/social-media-disinformation/tweets/20221002_not_found_or_suspended_users.csv",
  colClasses = c("NULL", "factor", NA, "factor")
)

# Use the `value` column as author_id and derive from `title`:
#   status         - readable label ("Forbidden" means suspended, anything
#                    else is labelled as not found)
#   suspended      - 1 when title is "Forbidden", otherwise 0
#   could_not_find - 1 when title is "Not Found Error", otherwise 0
# NOTE(review): an earlier comment here said that reading the CSV distorts the
# ids in their pre-existing column - presumably the column dropped above;
# confirm against the CSV.
account_status_october <- account_status_october %>%
  rename(author_id = value) %>%
  mutate(
    status = factor(
      if_else(
        title == "Forbidden",
        "User has been suspended",
        "Could not find user"
      )
    ),
    suspended = as.numeric(title == "Forbidden"),
    could_not_find = as.numeric(title == "Not Found Error")
  )
count(account_status_october, status, sort = TRUE)

# which() drops rows where the flag is NA, matching filter() semantics
suspended_authors_october <- with(
  account_status_october,
  author_id[which(suspended == 1)]
)
could_not_find_authors_october <- with(
  account_status_october,
  author_id[which(could_not_find == 1)]
)

## Percentage of indo_dt tweets posted by suspended authors
idt_suspended_tweets <- sum(
  pull(indo_dt, author_id) %in% suspended_authors_october
) # 103678
idt_total_tweets <- nrow(indo_dt)
idt_percent_tweets_suspended <- 100 * idt_suspended_tweets / idt_total_tweets
idt_percent_tweets_suspended

## Percentage of indo_dt authors who are suspended (October list)
idt_suspended_total <- indo_dt %>%
  filter(author_id %in% suspended_authors_october) %>%
  pull(author_id) %>%
  n_distinct() # 9438
idt_all_total <- indo_dt %>%
  pull(author_id) %>%
  n_distinct() # 360897

idt_percent_suspended <- 100 * idt_suspended_total / idt_all_total
idt_percent_suspended

## Percentage of suspended authors who posted at least one original tweet
## about special autonomy in the vertical bands

vblr_orig_otsus_authors <- vblr %>%
  filter(otsus & is_original) %>%
  pull(author_id) %>%
  unique()

suspended_vblr_orig_otsus_authors <- indo_dt %>%
  filter(
    author_id %in% suspended_authors_october,
    author_id %in% vblr_orig_otsus_authors
  ) %>%
  pull(author_id) %>%
  n_distinct() # 1708

suspended_vblr_orig_otsus_authors * 100 / idt_suspended_total # 18.09705

## Percentage of suspended authors who never posted any tweet in the vertical
## bands
vblr_authors <- vblr %>%
  pull(author_id) %>%
  unique()
no_vblr_tweets_suspended_authors <- indo_dt %>%
  filter(
    author_id %in% suspended_authors_october,
    !author_id %in% vblr_authors
  ) %>%
  pull(author_id) %>%
  n_distinct() # 7688

no_vblr_tweets_suspended_authors * 100 / idt_suspended_total

## Percentage of vblr authors who are suspended
100 * (vblr %>%
  filter(author_id %in% suspended_authors_october) %>%
  pull(author_id) %>%
  n_distinct()) /
  length(vblr_authors) # 0.4591971

